#load packages
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(ggplot2)
library(forcats)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(HH)
## Loading required package: grid
## Loading required package: latticeExtra
## Loading required package: RColorBrewer
##
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
##
## layer
## Loading required package: multcomp
## Loading required package: mvtnorm
## Loading required package: TH.data
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
## Attaching package: 'TH.data'
## The following object is masked from 'package:MASS':
##
## geyser
## Loading required package: gridExtra
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
##
## Attaching package: 'HH'
## The following object is masked from 'package:purrr':
##
## transpose
library(mi)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: stats4
## mi (Version 1.0, packaged: 2015-04-16 14:03:10 UTC; goodrich)
## mi Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
## This program comes with ABSOLUTELY NO WARRANTY.
## This is free software, and you are welcome to redistribute it
## under the General Public License version 2 or later.
## Execute RShowDoc('COPYING') for details.
##
## Attaching package: 'mi'
## The following object is masked from 'package:tidyr':
##
## complete
library(extracat)
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(rapportools)
## Loading required package: reshape
##
## Attaching package: 'reshape'
## The following object is masked from 'package:Matrix':
##
## expand
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
##
## Attaching package: 'rapportools'
## The following object is masked from 'package:Matrix':
##
## mean
## The following objects are masked from 'package:Hmisc':
##
## label, label<-
## The following object is masked from 'package:dplyr':
##
## n
## The following objects are masked from 'package:stats':
##
## IQR, median, sd, var
## The following objects are masked from 'package:base':
##
## max, mean, min, range, sum
library(vcd)
##
## Attaching package: 'vcd'
## The following object is masked from 'package:HH':
##
## odds
## The following object is masked from 'package:latticeExtra':
##
## rootogram
# set color
mycolor <- "#80593D"
myfill <- "#9FC29F"
# load data from moma and met
moma_artists <- read_csv("../data/raw/moma/Artists.csv")
## Parsed with column specification:
## cols(
## ConstituentID = col_double(),
## DisplayName = col_character(),
## ArtistBio = col_character(),
## Nationality = col_character(),
## Gender = col_character(),
## BeginDate = col_double(),
## EndDate = col_double(),
## `Wiki QID` = col_character(),
## ULAN = col_double()
## )
moma_artworks <- read_csv("../data/raw/moma/Artworks.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## DateAcquired = col_date(format = ""),
## ObjectID = col_double(),
## `Circumference (cm)` = col_logical(),
## `Depth (cm)` = col_double(),
## `Diameter (cm)` = col_double(),
## `Height (cm)` = col_double(),
## `Length (cm)` = col_logical(),
## `Weight (kg)` = col_double(),
## `Width (cm)` = col_double(),
## `Seat Height (cm)` = col_logical(),
## `Duration (sec.)` = col_logical()
## )
## See spec(...) for full column specifications.
## Warning: 3664 parsing failures.
## row col expected actual file
## 1002 Length (cm) 1/0/T/F/TRUE/FALSE 17.8 '../data/raw/moma/Artworks.csv'
## 1003 Length (cm) 1/0/T/F/TRUE/FALSE 19.0 '../data/raw/moma/Artworks.csv'
## 1004 Length (cm) 1/0/T/F/TRUE/FALSE 18.1 '../data/raw/moma/Artworks.csv'
## 1005 Length (cm) 1/0/T/F/TRUE/FALSE 17.78 '../data/raw/moma/Artworks.csv'
## 1007 Length (cm) 1/0/T/F/TRUE/FALSE 17.8 '../data/raw/moma/Artworks.csv'
## .... ........... .................. ...... ...............................
## See problems(...) for more details.
# 1. Missing data
# 1.1 MOMA
# cut out columns that we dont need
str(moma_artists)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 15853 obs. of 9 variables:
## $ ConstituentID: num 1 2 3 4 5 6 7 9 10 11 ...
## $ DisplayName : chr "Robert Arneson" "Doroteo Arnaiz" "Bill Arnold" "Charles Arnoldi" ...
## $ ArtistBio : chr "American, 1930–1992" "Spanish, born 1936" "American, born 1941" "American, born 1946" ...
## $ Nationality : chr "American" "Spanish" "American" "American" ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ BeginDate : num 1930 1936 1941 1946 1941 ...
## $ EndDate : num 1992 0 0 0 0 ...
## $ Wiki QID : chr NA NA NA "Q1063584" ...
## $ ULAN : num NA NA NA 5e+08 NA ...
## - attr(*, "spec")=
## .. cols(
## .. ConstituentID = col_double(),
## .. DisplayName = col_character(),
## .. ArtistBio = col_character(),
## .. Nationality = col_character(),
## .. Gender = col_character(),
## .. BeginDate = col_double(),
## .. EndDate = col_double(),
## .. `Wiki QID` = col_character(),
## .. ULAN = col_double()
## .. )
This is a dataset describing 15,853 observations with 9 columns. However, we can quickly notice that some columns are either redundant or seemingly meaningless for exploratory analysis, such as ConstituentID, ArtistBio (which is combining Nationality, BeginDate and EndDate), Wiki QID, ULAN; as a result, we will just go ahead and drop them.
# we first explore the artists dataset, we only include: DisplayName, Nationality, Gender, BeginDate, and EndDate
moma_artists <- moma_artists[c(2,4,5,6,7)]
str(moma_artists)
## Classes 'tbl_df', 'tbl' and 'data.frame': 15853 obs. of 5 variables:
## $ DisplayName: chr "Robert Arneson" "Doroteo Arnaiz" "Bill Arnold" "Charles Arnoldi" ...
## $ Nationality: chr "American" "Spanish" "American" "American" ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ BeginDate : num 1930 1936 1941 1946 1941 ...
## $ EndDate : num 1992 0 0 0 0 ...
Now, we just have 5 columns with the same number of observations: Artist Display Name, Nationality, Gender, Artist Birth Year, Artist Death Year. We will proceed to clean up the datasets and observed the patterns in the missing data.
# quick scan
# We will do a quick scan first:
colSums(is.na(moma_artists))
## DisplayName Nationality Gender BeginDate EndDate
## 0 2556 3179 0 0
From the report, it says that only Nationality and Gender are the columns with missing value. This seems odd as it is nearly impossible for BeginDate (Birth Year) and EndDate (Death Year) to not have any missing data. Thus, we go back and eyeball the dataset. From the process, we realize that the missing values in BeginDate and EndDate columns are denoted as 0 instead NA. That said, we will replace the 0s with NAs.
# clean up BeginDate and EndDate columns
moma_clean <- moma_artists
moma_clean$BeginDate[(moma_clean$BeginDate == 0)] <- NA
moma_clean$EndDate[(moma_clean$EndDate == 0)] <- NA
# now check again
colSums(is.na(moma_clean))
## DisplayName Nationality Gender BeginDate EndDate
## 0 2556 3179 3791 10779
After the clean up, now the NAs are revealed. There are 3,791 missing data in the column BeginDate and 10,799 missing in the EndDate column. This quite literally suggests that with the artists being featured in MOMA. More than half of them (~68%) are missing death years, and less than one-fourth (~24%) of them are missing birth years. Perhaps, there is difficulty with MOMA to collect full bio of the artists being featured in the museum, especially with the death years of the artists.
# group the other three columns and see if there is any NAs that are not tracked by NA due to human error while entering the data
nationality_na <- moma_clean %>% group_by(Nationality) %>% dplyr::summarise(Total = dplyr::n()) %>% arrange(Total)
gender_na <- moma_clean %>% group_by (Gender) %>% dplyr::summarise(Total = dplyr::n()) %>% arrange(Total)
name_na <- moma_clean %>% group_by(DisplayName) %>% dplyr::summarise(Total = dplyr::n()) %>% arrange(Total)
# by manually search "unknown" and related keywords in the dataset, we caught on error: NAs in Nationality are also denoted as "nationality unknown"
moma_clean$Nationality[moma_clean$Nationality == "Nationality unknown"] <- NA
# Similarly, with the Name column, there are also some bad format NAs
moma_clean$DisplayName <- tolower(moma_clean$DisplayName)
moma_clean$DisplayName[str_detect(moma_clean$DisplayName, "unknown") == TRUE] <- NA
# with the gender column, we noticed that the biggest problem is with lowercase and upper case; also, since there is only 1 data observation on binary gender, we might consider dropping it later
moma_clean$Gender <- tolower(moma_clean$Gender)
colSums(is.na(moma_clean))
## DisplayName Nationality Gender BeginDate EndDate
## 53 2736 3179 3791 10779
We also checked the other three columns, Display Name, Nationality, and Gender, in case there are some human errors (such as entering the data in the wrong format) that might lead to NAs not being tracked by correctly. In fact, there were some for the Display Name and Nationality columns. Nas in the Display column are denoted with characters spelling out in words. Similarly, NAs in the Nationality column are also denoted in words. After cleaning them up, we uncovered 53 NAs in the Display Name column, 180 NAs in the Nationality column and above is the result of complete missing data values in this dataset. For the Gender column, no NAs are uncovered, but there is upper case and lower case issue such as “male” and “Male”, so we just turn all the gender types to lower case.
The result above is the total sum of all the missing data in this dataset. We will plot a graph to better understand the missing patterns.
# plot missing patterns
visna(moma_clean, sort = 'r')
## Warning in melt(as.data.frame(xs), ncol(xs)): The melt generic in
## data.table has been passed a data.frame and will attempt to redirect to the
## relevant reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend
## the namespace like reshape2::melt(as.data.frame(xs)). In the next version,
## this warning will become an error.
Concluded from the graph above, most rows are missing just the EndDate column, which means missing the death year of an artist. Not missing any data is the second most common pattern in this dataset. But surprisingly, missing all of the columns (other than Display Name) is the third most common patterns. Thus, there are quite decent amount of rows that we cannot use (since they are missing four of the columns that we want to look into). We will now proceed to explore the dataset while taken into account of the missing patterns.
## 1.2. missing data from moma artworks dataset
str(moma_artworks)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 138124 obs. of 29 variables:
## $ Title : chr "Ferdinandsbrücke Project, Vienna, Austria (Elevation, preliminary version)" "City of Music, National Superior Conservatory of Music and Dance, Paris, France, View from interior courtyard" "Villa near Vienna Project, Outside Vienna, Austria, Elevation" "The Manhattan Transcripts Project, New York, New York, Introductory panel to Episode 1: The Park" ...
## $ Artist : chr "Otto Wagner" "Christian de Portzamparc" "Emil Hoppe" "Bernard Tschumi" ...
## $ ConstituentID : chr "6210" "7470" "7605" "7056" ...
## $ ArtistBio : chr "(Austrian, 1841–1918)" "(French, born 1944)" "(Austrian, 1876–1957)" "(French and Swiss, born Switzerland 1944)" ...
## $ Nationality : chr "(Austrian)" "(French)" "(Austrian)" "()" ...
## $ BeginDate : chr "(1841)" "(1944)" "(1876)" "(1944)" ...
## $ EndDate : chr "(1918)" "(0)" "(1957)" "(0)" ...
## $ Gender : chr "(Male)" "(Male)" "(Male)" "(Male)" ...
## $ Date : chr "1896" "1987" "1903" "1980" ...
## $ Medium : chr "Ink and cut-and-pasted painted pages on paper" "Paint and colored pencil on print" "Graphite, pen, color pencil, ink, and gouache on tracing paper" "Photographic reproduction with colored synthetic laminate" ...
## $ Dimensions : chr "19 1/8 x 66 1/2\" (48.6 x 168.9 cm)" "16 x 11 3/4\" (40.6 x 29.8 cm)" "13 1/2 x 12 1/2\" (34.3 x 31.8 cm)" "20 x 20\" (50.8 x 50.8 cm)" ...
## $ CreditLine : chr "Fractional and promised gift of Jo Carole and Ronald S. Lauder" "Gift of the architect in honor of Lily Auchincloss" "Gift of Jo Carole and Ronald S. Lauder" "Purchase and partial gift of the architect in honor of Lily Auchincloss" ...
## $ AccessionNumber : chr "885.1996" "1.1995" "1.1997" "2.1995" ...
## $ Classification : chr "Architecture" "Architecture" "Architecture" "Architecture" ...
## $ Department : chr "Architecture & Design" "Architecture & Design" "Architecture & Design" "Architecture & Design" ...
## $ DateAcquired : Date, format: "1996-04-09" "1995-01-17" ...
## $ Cataloged : chr "Y" "Y" "Y" "Y" ...
## $ ObjectID : num 2 3 4 5 6 7 8 9 10 11 ...
## $ URL : chr "http://www.moma.org/collection/works/2" "http://www.moma.org/collection/works/3" "http://www.moma.org/collection/works/4" "http://www.moma.org/collection/works/5" ...
## $ ThumbnailURL : chr "http://www.moma.org/media/W1siZiIsIjU5NDA1Il0sWyJwIiwiY29udmVydCIsIi1yZXNpemUgMzAweDMwMFx1MDAzZSJdXQ.jpg?sha=137b8455b1ec6167" "http://www.moma.org/media/W1siZiIsIjk3Il0sWyJwIiwiY29udmVydCIsIi1yZXNpemUgMzAweDMwMFx1MDAzZSJdXQ.jpg?sha=55b65fa4368fe00a" "http://www.moma.org/media/W1siZiIsIjk4Il0sWyJwIiwiY29udmVydCIsIi1yZXNpemUgMzAweDMwMFx1MDAzZSJdXQ.jpg?sha=fdcfca4db3acac1f" "http://www.moma.org/media/W1siZiIsIjEyNCJdLFsicCIsImNvbnZlcnQiLCItcmVzaXplIDMwMHgzMDBcdTAwM2UiXV0.jpg?sha=c89b9071486760a5" ...
## $ Circumference (cm): logi NA NA NA NA NA NA ...
## $ Depth (cm) : num NA NA NA NA NA NA NA NA NA NA ...
## $ Diameter (cm) : num NA NA NA NA NA NA NA NA NA NA ...
## $ Height (cm) : num 48.6 40.6 34.3 50.8 38.4 ...
## $ Length (cm) : logi NA NA NA NA NA NA ...
## $ Weight (kg) : num NA NA NA NA NA NA NA NA NA NA ...
## $ Width (cm) : num 168.9 29.8 31.8 50.8 19.1 ...
## $ Seat Height (cm) : logi NA NA NA NA NA NA ...
## $ Duration (sec.) : logi NA NA NA NA NA NA ...
## - attr(*, "problems")=Classes 'tbl_df', 'tbl' and 'data.frame': 3664 obs. of 5 variables:
## ..$ row : int 1002 1003 1004 1005 1007 1008 1011 1041 1042 1045 ...
## ..$ col : chr "Length (cm)" "Length (cm)" "Length (cm)" "Length (cm)" ...
## ..$ expected: chr "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" ...
## ..$ actual : chr "17.8" "19.0" "18.1" "17.78" ...
## ..$ file : chr "'../data/raw/moma/Artworks.csv'" "'../data/raw/moma/Artworks.csv'" "'../data/raw/moma/Artworks.csv'" "'../data/raw/moma/Artworks.csv'" ...
## - attr(*, "spec")=
## .. cols(
## .. Title = col_character(),
## .. Artist = col_character(),
## .. ConstituentID = col_character(),
## .. ArtistBio = col_character(),
## .. Nationality = col_character(),
## .. BeginDate = col_character(),
## .. EndDate = col_character(),
## .. Gender = col_character(),
## .. Date = col_character(),
## .. Medium = col_character(),
## .. Dimensions = col_character(),
## .. CreditLine = col_character(),
## .. AccessionNumber = col_character(),
## .. Classification = col_character(),
## .. Department = col_character(),
## .. DateAcquired = col_date(format = ""),
## .. Cataloged = col_character(),
## .. ObjectID = col_double(),
## .. URL = col_character(),
## .. ThumbnailURL = col_character(),
## .. `Circumference (cm)` = col_logical(),
## .. `Depth (cm)` = col_double(),
## .. `Diameter (cm)` = col_double(),
## .. `Height (cm)` = col_double(),
## .. `Length (cm)` = col_logical(),
## .. `Weight (kg)` = col_double(),
## .. `Width (cm)` = col_double(),
## .. `Seat Height (cm)` = col_logical(),
## .. `Duration (sec.)` = col_logical()
## .. )
We have 138124 observations with 29 variables. But again, some columns are redundant or not significantly meaningful in our case, so we will drop those.
# we chose title, artist, beginDate, endDate, Gender, Date Made, Medium, Classification, Acquisition Year
artworks_clean <- moma_artworks[c(1,2,5,6,7,8,9,10,14,16)]
# check missing
colSums(is.na(artworks_clean))
## Title Artist Nationality BeginDate EndDate
## 39 1455 1455 1455 1455
## Gender Date Medium Classification DateAcquired
## 1455 2370 10963 0 6741
So we almost have missings from all columns other than Classification column.
# fix NA format
# clean data that are bracked by ()
artworks_clean$Nationality <- removePunctuation(artworks_clean$Nationality)
artworks_clean$BeginDate <- removePunctuation(artworks_clean$BeginDate)
artworks_clean$EndDate <- removePunctuation(artworks_clean$EndDate)
artworks_clean$Gender <- removePunctuation(artworks_clean$Gender)
# add columns for acquisition year
artworks_clean <- artworks_clean %>% mutate(YearAcquired = substr(DateAcquired,1,4))
# for clarity, we remove the original column
artworks_clean <- dplyr::select(artworks_clean,-DateAcquired)
# Nationality and Gender columns have many empty rows so we use *is.empty* from *rapportools*
artworks_clean$Nationality[is.empty(artworks_clean$Nationality) == TRUE] <- NA
artworks_clean$Gender[is.empty(artworks_clean$Gender) == TRUE] <- NA
artworks_clean$Gender <- tolower(artworks_clean$Gender)
# BeginDate and EndDate have many 0 rows to indicate NAs
artworks_clean$BeginDate[artworks_clean$BeginDate == 0] <- NA
artworks_clean$EndDate[artworks_clean$EndDate == 0] <- NA
# clean "unknown" in Date column
artworks_clean$Date[artworks_clean$Date == "unknown"] <- NA
# we use the same mechanism that detect the most number of NAs (increase false negative, but make sure we increase true positive)
artworks_clean$Title[str_detect(artworks_clean$Title, "unknown") == TRUE] <- NA
artworks_clean$Artist[str_detect(artworks_clean$Artist, "unknown") == TRUE] <- NA
artworks_clean$Medium[str_detect(artworks_clean$Medium, "unknown") == TRUE] <- NA
artworks_clean$Title[str_detect(artworks_clean$Title, "Unknown") == TRUE] <- NA
artworks_clean$Artist[str_detect(artworks_clean$Artist, "Unknown") == TRUE] <- NA
artworks_clean$Medium[str_detect(artworks_clean$Medium, "Unknown") == TRUE] <- NA
artworks_clean$Nationality[str_detect(artworks_clean$Nationality, "Unknown") == TRUE] <- NA
artworks_clean$Nationality[str_detect(artworks_clean$Nationality, "unknown") == TRUE] <- NA
artworks_clean$Date[artworks_clean$Date == "n.d."] <- NA
colSums(is.na(artworks_clean))
## Title Artist Nationality BeginDate EndDate
## 85 5255 7448 9150 48431
## Gender Date Medium Classification YearAcquired
## 8518 3144 11005 0 6741
visna(artworks_clean, sort = "b")
## Warning in melt(as.data.frame(xs), ncol(xs)): The melt generic in
## data.table has been passed a data.frame and will attempt to redirect to the
## relevant reshape2 method; please note that reshape2 is deprecated, and this
## redirection is now deprecated as well. To continue using melt methods from
## reshape2 while both libraries are attached, e.g. melt.list, you can prepend
## the namespace like reshape2::melt(as.data.frame(xs)). In the next version,
## this warning will become an error.
# just to be safe, we group them again.
date_test <- artworks_clean %>% group_by(Date) %>% dplyr::summarise(Total = dplyr::n()) %>% arrange(Total)
aqr_test <- artworks_clean %>% group_by(YearAcquired) %>% dplyr::summarise(Total = dplyr::n()) %>% arrange(Total)
From the graph, EndDate is the column with most missing data, followed by Medium, BeginDate and Gender. To our surprise, Year Acquisition and Date Made have much fewer missing rows than other columns. This can point to the fact that when MOMA collected art pieces, they cared more about when the object was made, rather than the biography of the artist himself or herself (such as birth year, death year, gender, or even medium!)
# First, what is MOMA's preference on Nationality?
moma_org <- moma_clean[2] %>% group_by(Nationality) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
# check the stats
describe(moma_org)
## moma_org
##
## 2 Variables 127 Observations
## ---------------------------------------------------------------------------
## Nationality
## n missing distinct
## 126 1 126
##
## lowest : Afghan Albanian Algerian American Argentine
## highest: Venezuelan Vietnamese Welsh Yugoslav Zimbabwean
## ---------------------------------------------------------------------------
## Frequency
## n missing distinct Info Mean Gmd .05 .10
## 127 0 53 0.986 124.8 225.3 1.0 1.0
## .25 .50 .75 .90 .95
## 2.0 6.0 44.0 165.0 446.2
##
## lowest : 1 2 3 4 5, highest: 872 880 977 2736 5472
## ---------------------------------------------------------------------------
Of the 15,853 observations (including NAs), 95% of the data are less than the frequency of the top 5% of the nationalities, where the cut off is only 446 counts. Thus, we are definitely seeing a long-tail effect, that of the 15,853 observations, most of them are from the same countries on the top 5% list.
# take the top 5% that can represent the nationality column
moma_org5 <- moma_org %>% group_by (Frequency) %>% filter(Frequency > 446)
# filter out na for now
moma_org5 <- moma_org5 %>% group_by(Nationality) %>% filter(is.na(Nationality) == FALSE)
# frequency of nationality
ggplot(moma_org5, aes(fct_reorder(Nationality, Frequency), Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA All Time Favorite Nationality", subtitle = "Top 5% Frequency of Nationality") +
xlab("Nationality") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
We observed that the number of American artists is nearly 5 times the number of artists from other countries (5472 counts), which means one-third of the artists featured by MOMA are from the U.S..
# if by top 10%, which is 165
moma_org10 <- moma_org %>% group_by (Frequency) %>% filter(Frequency > 165)
# filter out NA for now
moma_org10 <- moma_org10 %>% group_by (Nationality) %>% filter(is.na(Nationality)== FALSE)
# frequency of nationality
ggplot(moma_org10, aes(fct_reorder(Nationality, Frequency), Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA Artist Nationality by Frequency (top 10%)") +
xlab("Nationality") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
Thus, we observed that among the 90% of the artworks collected by MOMA (which are 12 countries), 8 are European countries, 2 are North American countries, 1 is South American and 1 is Asian.
# Now, let's check by gender of those top 5%
moma_org5_gender <- moma_clean %>% group_by(Gender, Nationality) %>% dplyr::summarize(Frequency= dplyr::n()) %>% arrange(Frequency) %>% filter(Nationality %in% c("American", "British", "German", "Italian", "Japanese"))
#if exclude NA for now
moma_org5_gender <- moma_org5_gender %>% group_by(Gender) %>% filter(Gender %in% c("female", "male"))
ggplot(moma_org5_gender, aes(Nationality, Frequency, fill = Gender)) +
geom_bar(stat = "identity") +
ggtitle("MOMA All Time Favorite Nationality, Colored by Gender (top 5%)") +
xlab("Nationality") +
scale_fill_brewer(palette= "Oranges")+
geom_col(color = mycolor) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
# dplyr::ext, we look at Birth Year
moma_birthy <- moma_clean[4] %>% group_by (BeginDate) %>% dplyr::summarize(Frequency= dplyr::n()) %>% arrange(Frequency)
moma_birthy$BeginDate <- as.character(moma_birthy$BeginDate)
# again, check the stats
describe(moma_birthy)
## moma_birthy
##
## 2 Variables 234 Observations
## ---------------------------------------------------------------------------
## BeginDate
## n missing distinct
## 233 1 233
##
## lowest : 1730 1731 1746 1753 1765, highest: 2012 2014 2015 2016 2017
## ---------------------------------------------------------------------------
## Frequency
## n missing distinct Info Mean Gmd .05 .10
## 234 0 105 0.999 67.75 90.84 1.00 2.00
## .25 .50 .75 .90 .95
## 6.00 22.50 94.25 138.40 153.00
##
## Value 0 50 100 150 200 3800
## Frequency 121 33 42 33 4 1
## Proportion 0.517 0.141 0.179 0.141 0.017 0.004
## ---------------------------------------------------------------------------
Again, we take the 95% percentile, and see a similar result, that 95% of the birth years are fewer than 153 counts, which means most of the artists are from similar or identical birh years, which is quite surprising.
# let's see the top 5%
moma_birthy5 <- moma_birthy %>% group_by(Frequency) %>% filter(Frequency > 153)
# and exclude NA for now
moma_birthy5 <- moma_birthy5 %>% group_by(BeginDate) %>% filter(is.na(BeginDate) == FALSE)
# frequency of birth year
ggplot(moma_birthy5, aes(BeginDate, Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA Artist Birth Year by Frequency (top 5%)") +
xlab("Birth Year") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
We observed that these ten consequentive years (excluding 1945) accounts for the birth year of the artists that are most recognized by MOMA. Perhaps, these ten years really made artists creative :)
# let's see the top 10%, which is greater than 138
moma_birthy10 <- moma_birthy %>% group_by(Frequency) %>% filter(Frequency > 138)
# and exclude NA for now
moma_birthy10 <- moma_birthy10 %>% group_by(BeginDate) %>% filter(is.na(BeginDate) == FALSE)
# frequency of birth year
ggplot(moma_birthy10, aes(BeginDate, Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA Artist Birth Year by Frequency (top 10%)") +
xlab("Birth Year") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
Different from Nationality though, birth year is in fact a range of years with similar frequencies. For sake of simplicity, we take the years starting from (whatever years we decide)
# we plot by gender
moma_birthy5_gender <- moma_clean %>% group_by(Gender, BeginDate) %>% dplyr::summarize(Frequency=dplyr::n())
# just check the top 5%
moma_birthy5_gender$BeginDate <- factor(moma_birthy5_gender$BeginDate, levels = moma_birthy5$BeginDate)
# take out NAs
moma_birthy5_gender <- moma_birthy5_gender %>% group_by(BeginDate) %>% filter( !is.na(BeginDate))
## Warning: Factor `BeginDate` contains implicit NA, consider using
## `forcats::fct_explicit_na`
moma_birthy5_gender <- moma_birthy5_gender %>% group_by(Gender) %>% filter(Gender %in% c("female", "male"))
# plot it
ggplot(moma_birthy5_gender, aes(BeginDate, Frequency, fill = Gender)) +
geom_bar(stat = "identity") +
ggtitle("MOMA Artist Birth Year by Frequency, Colored by Gender (top 5%)") +
xlab("Birth Year") +
scale_fill_brewer(palette= "Oranges")+
geom_col(color = mycolor) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
# if plot on time series
moma_birthy_gen_time <- moma_clean %>% group_by(Gender, BeginDate) %>% dplyr::summarize(Frequency=dplyr::n())
moma_birthy_gen_time$BeginDate <- strtoi(moma_birthy_gen_time$BeginDate)
# remove NAs
moma_birthy_gen_time <- moma_birthy_gen_time %>% group_by(Gender) %>% filter(Gender %in% c("female", "male"))
moma_birthy_gen_time <- moma_birthy_gen_time %>% group_by(BeginDate) %>% filter(BeginDate >= 1900)
ggplot(moma_birthy_gen_time, aes(BeginDate, Frequency, color = Gender)) +
geom_line() +
ggtitle("MOMA Female-Male Birth Year Comparison") +
theme_grey(14) +
theme(legend.title = element_blank()) +
labs(x = "Year", y = "Frequency")
# calculate female-male ratio
moma_birthy_gen_ratio <- moma_clean
moma_birthy_gen_ratio <- moma_birthy_gen_ratio %>% group_by(Gender) %>% filter(Gender %in% c("female", "male"))
moma_birthy_gen_ratio <- moma_birthy_gen_ratio%>% group_by(BeginDate, Gender) %>% dplyr::summarise(Frequency = dplyr::n())
moma_birthy_gen_ratio <- moma_birthy_gen_ratio %>% group_by(BeginDate) %>% filter(BeginDate >= 1900 & BeginDate <= 1980)
moma_birthy_gen_ratio <- moma_birthy_gen_ratio %>% group_by(BeginDate) %>% dplyr::mutate(Ratio = 100*Frequency[Gender=="female"]/Frequency[Gender == "male"])
# plot on ratio
ggplot(moma_birthy_gen_ratio, aes(BeginDate, Ratio)) +
geom_line(color = "blue") +
ggtitle("Gender Ratio Peaked at Year 1971 (62%)!", subtitle = "Timeseries Graph on Birth Year Female-Male Ratio (1900 - 1980)") +
labs(x = "Year", y = "Ratio (%)") +
theme_grey(14) +
theme(plot.title = element_text(face = "bold")) +
theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
theme(plot.caption = element_text(color = "grey68"))
# Name analysis
moma_artist_name <- moma_clean %>% group_by(BeginDate) %>% filter(BeginDate >= 1937 & BeginDate <= 1947)
# Capitalized for visualization
moma_artist_name$DisplayName <- capitalize(moma_artist_name$DisplayName)
# 1.4.1 On Nationality
# first, a high level pic
# moma_artists_sub_org <- moma_artists_sub %>% group_by(Nationality) %>% dplyr::summarize(Frequency = dplyr::n()) %>% arrange(Frequency)
# NA only takes a small proportion, so we exclude it
# moma_artists_sub_org <- moma_artists_sub_org %>% group_by(Nationality) %>% filter(is.na(Nationality) == FALSE)
They collected a lot from American artists. But who are they? Can we tell?
moma_artists_sub_us <- moma_artist_name %>% group_by(Nationality) %>% filter(Nationality == 'American')
first_name_us <- as.data.frame(word(moma_artists_sub_us$DisplayName,1))
colnames(first_name_us) <- 'first_name'
first_name_count <- first_name_us %>% group_by(first_name) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
# first name frequency
ggplot(tail(first_name_count,10), aes(fct_reorder(first_name, Frequency), Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA's Favorite American First Names, by Frequency, 1937 - 1947") +
xlab("First Name") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
# let's check last name
last_name_us <- as.data.frame(word(moma_artists_sub_us$DisplayName,-1))
colnames(last_name_us) <- 'last_name'
last_name_count <- last_name_us %>% group_by(last_name) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
# first name frequency
ggplot(tail(last_name_count,10), aes(fct_reorder(last_name, Frequency), Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA's Favorite American Last Names, by Frequency, 1937 - 1947") +
xlab("Last Name") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
# if all
all_us <- moma_clean %>% group_by(Nationality) %>% filter(Nationality == 'American')
first_name_all <- as.data.frame(word(all_us$DisplayName,1))
colnames(first_name_all) <- 'first_name'
first_name_allc <- first_name_all %>% group_by(first_name) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
## Warning: Factor `first_name` contains implicit NA, consider using
## `forcats::fct_explicit_na`
#first_name_allc$first_name <- capitalise(first_name_allc$first_name)
# first name frequency
ggplot(tail(first_name_allc,10), aes(fct_reorder(first_name, Frequency), Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA's All Time Favorite American First Names, by Frequency") +
xlab("First Name") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
# if all
all_de <- moma_artists %>% group_by(Nationality) %>% filter(Nationality == 'German')
first_name_all_de <- as.data.frame(word(all_de$DisplayName,1))
colnames(first_name_all_de) <- 'first_name'
first_name_allc_de <- first_name_all_de %>% group_by(first_name) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
#first_name_allc_de$first_name <- capitalize(first_name_allc_de$first_name)
# first name frequency
ggplot(tail(first_name_allc_de,10), aes(fct_reorder(first_name, Frequency), Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA's All Time Favorite German First Names, by Frequency") +
xlab("First Name") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
# if all
all_fr <- moma_artists %>% group_by(Nationality) %>% filter(Nationality == 'French')
first_name_all_fr <- as.data.frame(word(all_fr$DisplayName,1))
colnames(first_name_all_fr) <- 'first_name'
first_name_allc_fr <- first_name_all_fr %>% group_by(first_name) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
# first name frequency
ggplot(tail(first_name_allc_fr,10), aes(fct_reorder(first_name, Frequency), Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA's All Time Favorite French First Names, by Frequency") +
xlab("First Name") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
# if all
all_uk <- moma_artists %>% group_by(Nationality) %>% filter(Nationality == 'British')
first_name_all_uk <- as.data.frame(word(all_uk$DisplayName,1))
colnames(first_name_all_uk) <- 'first_name'
first_name_allc_uk <- first_name_all_uk %>% group_by(first_name) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
# first name frequency
ggplot(tail(first_name_allc_uk,10), aes(fct_reorder(first_name, Frequency), Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA's All Time Favorite British First Names, by Frequency") +
xlab("First Name") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
# if all
all_jp <- moma_artists %>% group_by(Nationality) %>% filter(Nationality == 'Japanese')
first_name_all_jp <- as.data.frame(word(all_jp$DisplayName,1))
colnames(first_name_all_jp) <- 'first_name'
first_name_allc_jp <- first_name_all_jp %>% group_by(first_name) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
# first name frequency
ggplot(tail(first_name_allc_jp,10), aes(fct_reorder(first_name, Frequency), Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA's All Time Favorite Japanese First Names, by Frequency") +
xlab("First Name") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
# explore moma artworks
# to be clear though, for the sake of simplicity, we will only focus on artworks that made by single artists; so we drop all works made by multi artists
# Some pieces are done by more than one artist; check how many are there to decide what to do with them
artworks_single <- artworks_clean %>% mutate(NumberArtists = lengths(strsplit(Artist, ",")))
artworks_single <- artworks_single %>% mutate(NumberType = cut(NumberArtists, breaks = c(0,1,Inf), labels = c("Single", "Multiple")))
# art_collab <- moma_artworks %>% group_by(CollabClass) %>% dplyr::summarise(Total = dplyr::n())
artworks_single <- artworks_single %>% group_by(NumberType) %>% filter(NumberType == 'Single')
str(artworks_single)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 130174 obs. of 12 variables:
## $ Title : chr "Ferdinandsbrücke Project, Vienna, Austria (Elevation, preliminary version)" "City of Music, National Superior Conservatory of Music and Dance, Paris, France, View from interior courtyard" "Villa near Vienna Project, Outside Vienna, Austria, Elevation" "The Manhattan Transcripts Project, New York, New York, Introductory panel to Episode 1: The Park" ...
## $ Artist : chr "Otto Wagner" "Christian de Portzamparc" "Emil Hoppe" "Bernard Tschumi" ...
## $ Nationality : chr "Austrian" "French" "Austrian" NA ...
## $ BeginDate : chr "1841" "1944" "1876" "1944" ...
## $ EndDate : chr "1918" NA "1957" NA ...
## $ Gender : chr "male" "male" "male" "male" ...
## $ Date : chr "1896" "1987" "1903" "1980" ...
## $ Medium : chr "Ink and cut-and-pasted painted pages on paper" "Paint and colored pencil on print" "Graphite, pen, color pencil, ink, and gouache on tracing paper" "Photographic reproduction with colored synthetic laminate" ...
## $ Classification: chr "Architecture" "Architecture" "Architecture" "Architecture" ...
## $ YearAcquired : chr "1996" "1995" "1997" "1995" ...
## $ NumberArtists : int 1 1 1 1 1 1 1 1 1 1 ...
## $ NumberType : Factor w/ 2 levels "Single","Multiple": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "groups")=Classes 'tbl_df', 'tbl' and 'data.frame': 1 obs. of 2 variables:
## ..$ NumberType: Factor w/ 2 levels "Single","Multiple": 1
## ..$ .rows :List of 1
## .. ..$ : int 1 2 3 4 5 6 7 8 9 10 ...
We have now 130174 observations.
artworks_org <- artworks_single[3] %>% group_by(Nationality) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
describe(artworks_org)
## artworks_org
##
## 2 Variables 143 Observations
## ---------------------------------------------------------------------------
## Nationality
## n missing distinct
## 142 1 142
##
## lowest : American American American American American American American French American German
## highest: Venezuelan Vietnamese Welsh Yugoslav Zimbabwean
## ---------------------------------------------------------------------------
## Frequency
## n missing distinct Info Mean Gmd .05 .10
## 143 0 76 0.991 910.3 1720 1.0 1.0
## .25 .50 .75 .90 .95
## 2.0 13.0 130.5 804.8 2359.3
##
## lowest : 1 2 3 4 5, highest: 5585 7242 9270 22530 57234
## ---------------------------------------------------------------------------
Take last ten.
# exclude NA
artworks_org <- artworks_org %>% group_by (Nationality) %>% filter(is.na(Nationality) == FALSE)
# frequency of nationality
ggplot(tail(artworks_org,10), aes(fct_reorder(Nationality, Frequency), Frequency)) +
geom_bar(stat = "identity") +
ggtitle("MOMA artworks mostly contributed by ", subtitle = "Top 10 highest Nationality, by Frequency") +
xlab("Nationality") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
So will some nationality contribute more artworks even though under represented? NO.
# lets also look at gender
artworks_gender <- artworks_single %>% group_by(Gender, Nationality) %>% dplyr::summarize(Frequency= dplyr::n()) %>% arrange(Frequency) %>% filter(Nationality %in% c("American", "French", "German","British", "Spanish", "Italian", "Japanese","Swiss","Russian", "Dutch"))
#if exclude NA for now
artworks_gender <- artworks_gender %>% group_by(Gender) %>% filter(Gender %in% c("female", "male"))
ggplot(artworks_gender, aes(fct_reorder(Nationality, Frequency), Frequency, fill = Gender)) +
geom_bar(stat = "identity") +
ggtitle("MOMA All Time Favorite Nationality, Colored by Gender (top 5%)") +
xlab("Nationality") +
scale_fill_brewer(palette= "Oranges")+
geom_col(color = mycolor) +
coord_flip() +
theme(plot.title = element_text(face = "bold"))
So no.
aqr_freq <- artworks_single %>% group_by(YearAcquired) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
aqr_freq$YearAcquired <- strtoi(aqr_freq$YearAcquired)
ggplot(aqr_freq, aes(YearAcquired, Frequency)) +
geom_line(color = "blue") +
ggtitle("MOMA suddenly loved French Artworks around 1970", subtitle= "Acquisition Year Analysis on Nationality Frequency Change") +
labs(x = "Year", y = "Frequency") +
theme_grey(14) +
theme(plot.title = element_text(face = "bold")) +
theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
theme(plot.caption = element_text(color = "grey68"))
## Warning: Removed 1 rows containing missing values (geom_path).
# nationality frequency change over year
aqr_nationality <- artworks_single %>% group_by(YearAcquired, Nationality) %>% dplyr::summarise(Frequency = dplyr::n())
# exclude NA
aqr_nationality <- aqr_nationality %>% group_by(Nationality) %>% filter(Nationality %in% c("American", "French", "German","British", "Spanish", "Italian", "Japanese","Swiss","Russian", "Dutch"))
aqr_nationality$YearAcquired <- strtoi(aqr_nationality$YearAcquired)
ggplot(aqr_nationality, aes(YearAcquired, Frequency, color = Nationality)) +
geom_line() +
ggtitle("MOMA suddenly loved French Artworks around 1970", subtitle= "Acquisition Year Analysis on Nationality Frequency Change") +
labs(x = "Year", y = "Frequency") +
theme_grey(14) +
theme(plot.title = element_text(face = "bold")) +
theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
theme(plot.caption = element_text(color = "grey68"))
## Warning: Removed 10 rows containing missing values (geom_path).
# 100% stacked bar for collection of what year born
# mosaic -> female and male
aqr_gender <- artworks_single %>% group_by(YearAcquired, Gender) %>% dplyr::summarise(Frequency = dplyr::n())
# exclude NA
aqr_gender <- aqr_gender %>% group_by(Gender) %>% filter(Gender %in% c("female", "male"))
aqr_gender$YearAcquired <- strtoi(aqr_gender$YearAcquired)
ggplot(aqr_gender, aes(YearAcquired, Frequency, color = Gender)) +
geom_line() +
ggtitle("And those are mostly from Male Artists...", subtitle= "Acquisition Year Analysis on Gender Frequency Change") +
labs(x = "Year", y = "Frequency") +
theme_grey(14) +
theme(plot.title = element_text(face = "bold")) +
theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
theme(plot.caption = element_text(color = "grey68"))
## Warning: Removed 2 rows containing missing values (geom_path).
So the question becomes: who are those artists? A few of them? Or a bunch of them?
The years are 1968 and 1964.
# year around thos
peak_1970 <- artworks_single %>% group_by(YearAcquired, Nationality, Artist) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
peak_1970 <- peak_1970 %>% group_by(Nationality) %>% filter(Nationality == "French")
peak_1970 <- peak_1970 %>% group_by(YearAcquired) %>% filter(YearAcquired %in% c("1964", "1968"))
peak_1970 <- peak_1970 %>% group_by(Frequency) %>% filter(Frequency > 100)
peak_1970$YearAcquired <- strtoi(peak_1970$YearAcquired)
ggplot(peak_1970, aes(fct_reorder(Artist, Frequency), Frequency)) +
geom_bar(position = "dodge", stat = "identity") +
ggtitle("What happened in Year 1964 and 1968!?") +
labs(x = "Frequency", y = "Artist") +
geom_col(color = mycolor, fill = myfill) +
facet_wrap(~YearAcquired) +
theme_grey(14) +
theme(plot.title = element_text(face = "bold")) +
theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
theme(plot.caption = element_text(color = "grey68"))
So we see that in year 1964, MOMA acquired artworks from different French artists, but in year 1968, they acquired mostly just from one artists, Eugène Atget.
# in addition, see those peak years for American artworks
us_artworks <- artworks_single %>% group_by(YearAcquired, Nationality, Artist) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
us_artworks <- us_artworks %>% group_by(Nationality) %>% filter(Nationality == "American")
us_artworks <- us_artworks %>% group_by(YearAcquired) %>% filter(YearAcquired %in% c("1974", "2008"))
us_artworks <- us_artworks %>% group_by(Frequency) %>% filter(Frequency > 100)
peak_1970$YearAcquired <- strtoi(peak_1970$YearAcquired)
ggplot(us_artworks, aes(fct_reorder(Artist, Frequency), Frequency)) +
geom_bar(position = "dodge", stat = "identity") +
ggtitle("What happened in Year 1964 and 1968!?") +
labs(x = "Frequency", y = "Artist") +
geom_col(color = mycolor, fill = myfill) +
facet_wrap(~YearAcquired) +
theme_grey(14) +
theme(plot.title = element_text(face = "bold")) +
theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
theme(plot.caption = element_text(color = "grey68"))
# let's also see the overall artist number
artworks_artist <- artworks_single %>% group_by(Artist) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
# filter out NAs
artworks_artist <- artworks_artist %>% group_by(Artist) %>% filter(is.na(Artist) == FALSE)
ggplot(tail(artworks_artist, 10), aes(fct_reorder(Artist, Frequency),Frequency)) +
geom_bar(position = "dodge", stat = "identity") +
ggtitle("Artists that Contributed Most Artworks to MOMA", subtitle = "Top 10 Artists Frequency Bar Chart") +
labs(x = "Frequency", y = "Artist") +
geom_col(color = mycolor, fill = myfill) +
coord_flip() +
theme(plot.title = element_text(face = "bold")) +
theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
theme(plot.caption = element_text(color = "grey68"))
# see top classifications
class_freq <- artworks_single %>% group_by(Classification) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
# see what classification they like and how they change over year on the top 10
artworks_class <- artworks_single %>% group_by(YearAcquired, Classification) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
artworks_class$YearAcquired <- strtoi(artworks_class$YearAcquired)
artworks_class <- artworks_class %>% group_by(Classification) %>% filter(Classification %in% c("Print", "Photograph", "Illustrated Book","Drawing","Design", "Architecture","Painting", "Video"))
ggplot(artworks_class, aes(YearAcquired, Frequency, color = Classification)) +
geom_line() +
ggtitle("Illustrated Book and Photograph", subtitle= "Acquisition Year Analysis on Gender Frequency Change") +
labs(x = "Year", y = "Frequency") +
theme_grey(14) +
theme(plot.title = element_text(face = "bold")) +
theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
theme(plot.caption = element_text(color = "grey68"))
## Warning: Removed 8 rows containing missing values (geom_path).
# medium frequency
med_freq <- artworks_single %>% group_by(Medium) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
artworks_med <- artworks_single %>% group_by(YearAcquired, Medium) %>% dplyr::summarise(Frequency = dplyr::n())
# top 10 frequent medium
artworks_med <- artworks_med %>% group_by(Medium) %>% filter( Medium %in% c("Gelatin silver print", "Lithograph", "Albumen silver print","Pencil on paper","Letterpress", "Etching","Chromogenic color print", "Lithograph, printed in color"))
artworks_med$YearAcquired <- strtoi(artworks_med$YearAcquired)
ggplot(artworks_med, aes(YearAcquired, Frequency, color = Medium)) +
geom_line() +
ggtitle("Gelatin Silver Print can be their New Favorite!", subtitle= "Acquisition Year Analysis on Gender Frequency Change") +
labs(x = "Year", y = "Frequency") +
theme_grey(14) +
theme(plot.title = element_text(face = "bold")) +
theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
theme(plot.caption = element_text(color = "grey68"))
## Warning: Removed 7 rows containing missing values (geom_path).
aqr_birth <- artworks_single %>% group_by(YearAcquired,BeginDate) %>% dplyr::summarise(Frequency = dplyr::n()) %>% arrange(Frequency)
aqr_birth$YearAcquired <- strtoi(aqr_birth$YearAcquired)
ggplot(aqr_birth, aes(YearAcquired, Frequency, color = BeginDate)) +
geom_line() +
ggtitle("Gelatin Silver Print can be their New Favorite!", subtitle= "Acquisition Year Analysis on Gender Frequency Change") +
labs(x = "Year", y = "Frequency") +
theme_grey(14) +
theme(plot.title = element_text(face = "bold")) +
theme(plot.subtitle = element_text(face = "bold", color = "grey35")) +
theme(plot.caption = element_text(color = "grey68"))
## Warning: Removed 123 rows containing missing values (geom_path).
# trying to see if there is pattern of acq of artist birth year
artworks_mos <- artworks_single[c(3,4,6,9,10)]
artworks_mos <- artworks_mos %>% mutate(BirthIdx = (BeginDate < 1950))
artworks_mos$BirthIdx[(artworks_mos$BirthIdx)== TRUE] <- 1
artworks_mos$BirthIdx[(artworks_mos$BirthIdx)== FALSE] <- 3
artworks_mos <- artworks_mos %>% mutate(BirthType = cut(BirthIdx, breaks = c(0,1,Inf), labels = c("First Half", "Second Half")))
artworks_mos <- artworks_mos %>% group_by(Gender, BirthType) %>% dplyr::summarise(Frequency = dplyr::n())
## Warning: Factor `BirthType` contains implicit NA, consider using
## `forcats::fct_explicit_na`
vcd::mosaic(BirthType ~ Gender, artworks_mos)